In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot
init_notebook_mode()

%matplotlib inline 

from IPython.display import HTML

HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')
Out[1]:
In [2]:
df = pd.read_csv("cleaned.csv")
df0 = df
tran = pd.read_csv("train_2016_v2.csv")
In [3]:
#clean up the transaction dates by leaving only the month for easier graphics
yearmonth = []
for i in range(0,len(tran)):
    yearmonth.append(tran["transactiondate"][i][0:7])
    
tran["yearmonth"] = yearmonth

tran  =tran.sort_values(by=['yearmonth'])
In [4]:
transaction = pd.merge(tran,df, on='parcelid', how='inner')
transaction =transaction.sort_values(by=['yearmonth'])
In [5]:
#transaction.groupby('fips').count()["parcelid"]
#fips = county code
#6037 = los angeles
#6059 = Orange county
#6111 = Ventura County
In [6]:
jan = transaction[transaction["yearmonth"] == '2016-01'].groupby('fips').count()["parcelid"].values.tolist()
feb = transaction[transaction["yearmonth"] == '2016-02'].groupby('fips').count()["parcelid"].values.tolist()
march = transaction[transaction["yearmonth"] == '2016-03'].groupby('fips').count()["parcelid"].values.tolist()
april = transaction[transaction["yearmonth"] == '2016-04'].groupby('fips').count()["parcelid"].values.tolist()
may = transaction[transaction["yearmonth"] == '2016-05'].groupby('fips').count()["parcelid"].values.tolist()
june = transaction[transaction["yearmonth"] == '2016-06'].groupby('fips').count()["parcelid"].values.tolist()
july = transaction[transaction["yearmonth"] == '2016-07'].groupby('fips').count()["parcelid"].values.tolist()
aug = transaction[transaction["yearmonth"] == '2016-08'].groupby('fips').count()["parcelid"].values.tolist()
sep = transaction[transaction["yearmonth"] == '2016-09'].groupby('fips').count()["parcelid"].values.tolist()
octo = transaction[transaction["yearmonth"] == '2016-10'].groupby('fips').count()["parcelid"].values.tolist()
nov = transaction[transaction["yearmonth"] == '2016-11'].groupby('fips').count()["parcelid"].values.tolist()
dec = transaction[transaction["yearmonth"] == '2016-12'].groupby('fips').count()["parcelid"].values.tolist()

pcts = [jan,feb,march,april, may,june, july,octo, sep, nov, dec]
l1= []
l2 = []
l3 = []
for i in pcts:
    l1.append(i[0])
    l2.append(i[1])
    l3.append(i[2])

Zillow's California Housing Market

When and where are the transactions located?

  • There is high volumne of transactiosn happening in May, June, and July
  • Most of these transactions occur in Los Angeles
In [7]:
import plotly
import plotly.graph_objs as go
x = ["January", "February", "March","April","May", "June","July","August", "September", "October","November","December"]
trace1 = go.Bar(
    x=x,
    y=l1,
    name='Los Angeles'
)
trace2 = go.Bar(
    x=x,
    y=l2,
    name='Orange County'
)
trace3 = go.Bar(
    x=x,
    y=l3,
    name='Ventura County'
)

data = [trace1, trace2,trace3]
layout = go.Layout(
    barmode='stack',
    title = "Number of Transactions by Month"
)

fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig, filename='stacked-bar')
In [8]:
transaction["latitude"] = transaction["latitude"]/1000000
transaction["longitude"] = transaction["longitude"]/1000000

np.random.seed(11452)
train = np.random.rand(len(transaction)) <0.5
trans_train = transaction[train]
#len(trans_train)

Geolcation Plot with Prices of Properties

  • The first geolocation plot indicates that transactions are located in Los Angeles and roughly cost around the 1 million USD threshold.
  • Ventura and orange county seem to cover a small part of the transactions but more of the higher priced properties.
  • A small cluster of high price property transactions are in malibu, beverly hills, and Newport beach.
In [39]:
import plotly
from plotly.plotly import iplot
import plotly.graph_objs as go

mapbox_access_token = 'pk.eyJ1IjoiYmlwYXJ0aXRlaHlwZXJjdWJlIiwiYSI6ImNqczFjZzUydjF0MGc0OW1sYWIwYW5lY2UifQ.l6OVeSa3poXp6S4s8km8kA'
map_style = "mapbox://styles/bipartitehypercube/cjs26r8sw1cy31fqjhopqigtj"

scl = [0,"rgb(0,0,0)"],[0.25,"rgb(0, 0, 128)"],[0.5,"rgb(0, 25, 255)"],\
[0.75,"rgb(0, 0, 205)"],[1,"rgb(250, 250, 250)"]


data = [go.Scattermapbox(
            lat=trans_train['latitude'],
            lon=trans_train['longitude'],
            mode='markers',
            marker = dict(color = (trans_train['taxvaluedollarcnt']),colorscale = scl,reversescale = True,opacity = 0.7,size = 5,
            colorbar = dict(thickness = 10,titleside = "right",
                            outlinecolor = "rgba(68, 68, 68, 0)",
                            ticks = "outside",ticklen = 3,
                            showticksuffix = "last",
                        ticksuffix = " Dollars",dtick = 1000000)))]

layout = go.Layout(
    autosize=True,
    hovermode='closest',
    mapbox=dict(
        accesstoken=mapbox_access_token,
        bearing=0,
        center=dict(lat=34,lon=-118.5),
        style= map_style,
        pitch=0,
        zoom=7,),
    title = 'Transaction location with Property Price',
)



fig = dict(data=data, layout=layout)
plotly.offline.iplot(fig, filename='Geolocation of properties') 

Geolocation Plot with Size of the Property

  • The geolocation plot shows the size of the house based on the location people are buying.
  • The size of the house isn't directly correlated with the price since the suburbs around Los Angeles seems to be bigger and cheaper. Specifically around the Palm dale, Yorba linda, and Villa Park.
  • Ventura and Orange county also includes the larger and more expensive properties in California
In [10]:
import plotly
from plotly.plotly import iplot
import plotly.graph_objs as go

mapbox_access_token = 'pk.eyJ1IjoiYmlwYXJ0aXRlaHlwZXJjdWJlIiwiYSI6ImNqczFjZzUydjF0MGc0OW1sYWIwYW5lY2UifQ.l6OVeSa3poXp6S4s8km8kA'
map_style = "mapbox://styles/bipartitehypercube/cjs26r8sw1cy31fqjhopqigtj"

scl = [0,"rgb(0,0,0)"],[0.25,"rgb(0, 0, 128)"],[0.5,"rgb(0, 25, 255)"],\
[0.75,"rgb(0, 0, 205)"],[1,"rgb(250, 250, 250)"]


data = [go.Scattermapbox(
            lat=trans_train['latitude'],
            lon=trans_train['longitude'],
            mode='markers',
            marker = dict(color = (trans_train["calculatedfinishedsquarefeet"]),colorscale = scl,reversescale = True,opacity = 0.7,size = 5,
            colorbar = dict(thickness = 10,titleside = "right",
                            outlinecolor = "rgba(68, 68, 68, 0)",
                            ticks = "outside",ticklen = 3,
                            showticksuffix = "last",
                        ticksuffix = " Square Feet",dtick = 700)))]

layout = go.Layout(
    autosize=True,
    hovermode='closest',
    mapbox=dict(
        accesstoken=mapbox_access_token,
        bearing=0,
        center=dict(lat=34,lon=-118.5),
        style= map_style,
        pitch=0,
        zoom=7,),
    title = 'Transaction location with Property Size',
)



fig = dict(data=data, layout=layout)
plotly.offline.iplot(fig, filename='Geolocation of properties') 

Geolocation Plot with Price Per Square Feet

  • The plot shows a relationship with the two previous plots
  • Very interestingly, the price per square feet in both the Ventura conuty and Orange county are much cheaper.
  • The properties that people are purchasing in central Los Angeles are relatively low.
  • The cluster around LA Palmdale airport seems to be relatively cheap per square feet but large in property size.
In [41]:
import plotly
from plotly.plotly import iplot
import plotly.graph_objs as go

mapbox_access_token = 'pk.eyJ1IjoiYmlwYXJ0aXRlaHlwZXJjdWJlIiwiYSI6ImNqczFjZzUydjF0MGc0OW1sYWIwYW5lY2UifQ.l6OVeSa3poXp6S4s8km8kA'
map_style = "mapbox://styles/bipartitehypercube/cjs26r8sw1cy31fqjhopqigtj"

scl = [0,"rgb(0,0,0)"],[0.25,"rgb(0, 0, 128)"],[0.5,"rgb(0, 25, 255)"],\
[0.75,"rgb(0, 0, 205)"],[1,"rgb(250, 250, 250)"]


data = [go.Scattermapbox(
            lat=trans_train['latitude'],
            lon=trans_train['longitude'],
            mode='markers',
            marker = dict(color = (trans_train['taxvaluedollarcnt']/trans_train["calculatedfinishedsquarefeet"]),colorscale = scl,reversescale = True,opacity = 0.7,size = 5,
            colorbar = dict(thickness = 10,titleside = "right",
                            outlinecolor = "rgba(68, 68, 68, 0)",
                            ticks = "outside",ticklen = 3,
                            showticksuffix = "last",
                        ticksuffix = " Dollars Per Square Feet",dtick = 700)))]

layout = go.Layout(
    autosize=True,
    hovermode='closest',
    mapbox=dict(
        accesstoken=mapbox_access_token,
        bearing=0,
        center=dict(lat=34,lon=-118.5),
        style= map_style,
        pitch=0,
        zoom=7,),
    title = 'Transaction location',
)



fig = dict(data=data, layout=layout)
plotly.offline.iplot(fig, filename='Geolocation of properties') 
In [12]:
freq = df["yearbuilt"].value_counts()
freq = freq.values
freq = freq.tolist()

years = df["yearbuilt"].unique()
years = years.tolist()

years1 = transaction["yearbuilt"].unique()
years1 = years1.tolist()

freq1 = transaction["yearbuilt"].value_counts()
freq1 = freq1.values
freq1 = freq1.tolist()

ly0 = []
for i in years:
    ly0.append(int(i))
    
ly1 = []
for i in years1:
    ly1.append(int(i))
In [13]:
overall = pd.DataFrame({'yearbuilt': ly0,'Marketprop': freq})
transact = pd.DataFrame({'yearbuilt': ly1,'Transactionofprop': freq1})

comparison = pd.merge(overall,transact, on = 'yearbuilt', how = 'outer')
comparison = comparison.fillna(0)
#comparison['yearbuilt'] = comparison['yearbuilt'].apply(str)
In [14]:
from sklearn import preprocessing

min_max_scaler = preprocessing.MinMaxScaler()
column_names_to_normalize = ['Marketprop','Transactionofprop']
x = comparison[column_names_to_normalize].values
x_scaled = min_max_scaler.fit_transform(x)
df_temp = pd.DataFrame(x_scaled, columns=column_names_to_normalize, index = comparison.index)
comparison[column_names_to_normalize] = df_temp
In [15]:
#comparison[comparison["Transactionofprop"] > 0.8]
#comparison[comparison["Marketprop"] > 0.8]

Does the year built of the property affect the purchase of these houses?

  • The left plot (blue line) indicates that the overal supply of the market based on the age of the house is from 1900 to 2000s.

  • The left plot (orange line) indicates that transactions do overlay the supply of properties in the market. However, the two line plots spikes at different areas.

  • The orangle line in the left plot show that consumers like houses built around 1950s to 1960s and from 1980s to 1990s. This may indicate buying houses from 1960s for remodeling and 1980s for living.

  • The right plot shows the prediction error of housing prices by Zillow. The Zillow estimates are relatively accurate. By locating the high fluctions in prediction error we can understand that the house prices are highly volatile from the 1880s to 1940s.

In [38]:
import matplotlib.pyplot as plt
from matplotlib import pylab
import datetime
import numpy as np
import seaborn as sns; sns.set()

palette = dict(zip([1,2],
                   sns.color_palette("nipy_spectral_r", 4)))
fig, (ax1, ax2)= plt.subplots(ncols = 2,figsize=(16, 5))
ax1 = sns.lineplot(comparison['yearbuilt'],comparison["Marketprop"], data = comparison, palette = palette,ax=ax1, label = "Property Supply")
leg =ax1.legend();
ax1 = sns.lineplot(comparison['yearbuilt'],comparison["Transactionofprop"], data = comparison, palette = palette,ax=ax1, label = "Property Demand").set_title('Properties by Year Built')
ax2 = sns.lineplot(transaction['yearbuilt'],abs(transaction["logerror"]), data = transaction, ax = ax2, label = "Log Error").set_title("Logerror Fluctuations by Year Built")